Reduce
=================

对指定维度进行归约。

输入:

- **src_data** - 输入数据的地址
- **param** - 算子计算所需参数的结构体。其各成员见下述。
- **core_mask** - 核掩码。
- **tmp_src_data** / **tmp_dst_data** - 仅私有存储版本使用的临时输入/输出空间,需分配在DDR或SMC中。

**ReduceParameter定义:**

.. code-block:: c
   :linenos:

   typedef struct ReduceParameter {
       void** data_buffers_;  // 用于存储中间计算结果
       int* outer_sizes_;     // 处理某个归约轴时,该轴之前所有轴的元素数
       int* inner_sizes_;     // 某个归约轴之后的所有元素数
       int* axis_sizes_;      // 归约轴的元素数
       int total_num_;        // 输入张量的总元素数
       int num_axes_;         // 待归约轴的数目
       int mode_;             // 归约模式
       int output_num_;       // 输出张量的总元素数
       /**该算子会根据ReduceParameter中的mode_参数选择实际归约所使用的方法。共有如下几种方法:
          Reduce_Mean=0, Reduce_Max=1, Reduce_Min=2, Reduce_Prod=3,
          Reduce_Sum=4, Reduce_SumSquare=5, Reduce_ASum=6, Reduce_L2Norm=7
       **/
   } ReduceParameter;

输出:

- **dst_data** - 输出地址。

支持平台:

``FT78NE`` ``MT7004``

.. note::

   - FT78NE 支持int8, int16, int32, fp32, fp64
   - MT7004 支持fp16, fp32, int16, int32

**共享存储版本:**

.. c:function:: void i8_reduce_s(int8_t* src_data, int8_t* dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void i16_reduce_s(int16_t* src_data, half* dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void i32_reduce_s(int* src_data, float* dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void hp_reduce_s(half* src_data, half* dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void fp_reduce_s(float* src_data, float* dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void dp_reduce_s(double* src_data, double* dst_data, ReduceParameter* param, int core_mask)

**C调用示例:**

.. 
code-block:: c
   :linenos:
   :emphasize-lines: 78

   // Fill the per-axis size tables of `param` for every axis listed in `axes`.
   //
   // For the i-th reduced axis this stores the product of the dimensions before
   // it (outer_sizes_), the product of the dimensions after it (inner_sizes_) and
   // its own extent (axis_sizes_). Once an axis has been recorded, its extent is
   // set to 1 in a local copy of the shape, so the sizes computed for later axes
   // see that axis with extent 1. total_num_ is set to the product of all
   // input dimensions.
   //
   // The local shape copy has 8 entries, so ndim must not exceed 8.
   void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
       int tmp_input_shape[8];
       int total_num = 1;
       int i, j, k;
       for (i = 0; i < ndim; i++) {
           tmp_input_shape[i] = input_shape[i];
           total_num *= input_shape[i];
       }
       param->total_num_ = total_num;
       for (i = 0; i < num_axes; ++i) {
           int axis = axes[i];
           int outer_size = 1;
           int inner_size = 1;
           for (j = 0; j < axis; j++) {
               outer_size *= tmp_input_shape[j];
           }
           for (k = axis + 1; k < ndim; k++) {
               inner_size *= tmp_input_shape[k];
           }
           param->outer_sizes_[i] = outer_size;
           param->inner_sizes_[i] = inner_size;
           param->axis_sizes_[i] = tmp_input_shape[axis];
           tmp_input_shape[axis] = 1;  // this axis has extent 1 for later axes
       }
   }

   // Shared-memory example: reduce an fp32 tensor over `axes` on the cores
   // selected by `core_mask`. Logical core 0 fills in the parameter block, the
   // cores synchronize, and then every core running this function calls the kernel.
   // `keep_dims` is accepted but not used by this example.
   void TestReduceSMCFp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
       int core_id = get_core_id();
       int logic_core_id = GetLogicCoreId(core_mask, core_id);
       int core_num = GetCoreNum(core_mask);
       float* input = (float*)0x88000000;
       float* output = (float*)0x98000000;
       ReduceParameter* param = (ReduceParameter*)0xA8480000;
       if (logic_core_id == 0) {
           int i, j;
           int output_num = 1;
           param->num_axes_ = num_axes;
           param->mode_ = mode;
           param->data_buffers_ = (void**)0xA8483000;
           param->inner_sizes_ = (int*)0xA8484000;
           param->outer_sizes_ = (int*)0xA8485000;
           param->axis_sizes_ = (int*)0xA8486000;
           // output_num_ is the product of the dimensions that are not reduced.
           for (i = 0; i < ndim; i++) {
               int reduce_axis = 0;
               for (j = 0; j < num_axes; j++) {
                   if (axes[j] == i) {
                       reduce_axis = 1;
                       break;
                   }
               }
               if (!reduce_axis) {
                   output_num *= input_shape[i];
               }
           }
           param->output_num_ = output_num;
           // Give each of the num_axes - 1 intermediate results its own buffer.
           for (i = 0; i < num_axes - 1; i++) {
               param->data_buffers_[i] = (void*)(0xA8490000u + (unsigned)i * 0x1000000u);
           }
           PackParam(param, ndim, input_shape, num_axes, axes);
       }
       sys_bar(0, core_num);  // synchronize once the parameters are initialized
       fp_reduce_s(input, output, param, core_mask);
   }

   void main() {
       int input_shape[3] = {4, 5, 5};
       int ndim = 3;
       int axes[1] = {1};
       int num_axes = 1;
       int mode = 7;  // Reduce_L2Norm
       int keep_dims = 1;
       int core_mask = 0b1111;
       TestReduceSMCFp32(input_shape, ndim, axes, num_axes, mode, keep_dims, core_mask);
   }

**私有存储版本:**

.. 
c:function:: void i8_reduce_p(int8_t* src_data, int8_t* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void i16_reduce_p(int16_t* src_data, half* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void i32_reduce_p(int* src_data, float* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void hp_reduce_p(half* src_data, half* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void fp_reduce_p(float* src_data, float* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

.. c:function:: void dp_reduce_p(double* src_data, double* dst_data, void* tmp_src_data, void* tmp_dst_data, ReduceParameter* param, int core_mask)

**C调用示例:**

.. code-block:: c
   :linenos:
   :emphasize-lines: 73

   // Fill the per-axis size tables of `param` for every axis listed in `axes`.
   //
   // For the i-th reduced axis this stores the product of the dimensions before
   // it (outer_sizes_), the product of the dimensions after it (inner_sizes_) and
   // its own extent (axis_sizes_). Once an axis has been recorded, its extent is
   // set to 1 in a local copy of the shape, so the sizes computed for later axes
   // see that axis with extent 1. total_num_ is set to the product of all
   // input dimensions.
   //
   // The local shape copy has 8 entries, so ndim must not exceed 8.
   void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
       int tmp_input_shape[8];
       int total_num = 1;
       int i, j, k;
       for (i = 0; i < ndim; i++) {
           tmp_input_shape[i] = input_shape[i];
           total_num *= input_shape[i];
       }
       param->total_num_ = total_num;
       for (i = 0; i < num_axes; ++i) {
           int axis = axes[i];
           int outer_size = 1;
           int inner_size = 1;
           for (j = 0; j < axis; j++) {
               outer_size *= tmp_input_shape[j];
           }
           for (k = axis + 1; k < ndim; k++) {
               inner_size *= tmp_input_shape[k];
           }
           param->outer_sizes_[i] = outer_size;
           param->inner_sizes_[i] = inner_size;
           param->axis_sizes_[i] = tmp_input_shape[axis];
           tmp_input_shape[axis] = 1;  // this axis has extent 1 for later axes
       }
   }

   // Private-memory example: reduce an fp32 tensor over `axes` on a single core.
   // `keep_dims` is accepted but not used by this example.
   void TestReduceL2Fp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
       float* input = (float*)0x10000000;      // original input/output data must be allocated in AM
       float* output = (float*)0x10010000;
       float* tmp_input = (float*)0x88000000;  // temporary input/output space must be allocated in DDR or SMC
       float* tmp_output = (float*)0x98000000;
       ReduceParameter* param = (ReduceParameter*)0x10020000;
       int i, j;
       int length = 1;  // running product for the number of output elements
       param->num_axes_ = num_axes;
       param->mode_ = mode;
       param->data_buffers_ = (void**)0x10021000;
       param->inner_sizes_ = (int*)0x10022000;
       param->outer_sizes_ = (int*)0x10023000;
       param->axis_sizes_ = (int*)0x10024000;
       // The output size is the product of the dimensions that are not reduced.
       for (i = 0; i < ndim; i++) {
           int reduce_axis = 0;
           for (j = 0; j < num_axes; j++) {
               if (axes[j] == i) {
                   reduce_axis = 1;
                   break;
               }
           }
           if (!reduce_axis) {
               length *= input_shape[i];
           }
       }
       param->output_num_ = length;
       // Every intermediate result buffer must be allocated in DDR or SMC; give
       // each of the num_axes - 1 intermediate results its own buffer.
       for (i = 0; i < num_axes - 1; i++) {
           param->data_buffers_[i] = (void*)(0xA8490000u + (unsigned)i * 0x1000000u);
       }
       PackParam(param, ndim, input_shape, num_axes, axes);
       fp_reduce_p(input, output, tmp_input, tmp_output, param, core_mask);
   }

   void main() {
       int input_shape[3] = {4, 5, 5};
       int ndim = 3;
       int axes[1] = {1};
       int num_axes = 1;
       int mode = 7;  // Reduce_L2Norm
       int keep_dims = 1;
       int core_mask = 0b0001;  // the private-memory version may only be launched on a single core
       TestReduceL2Fp32(input_shape, ndim, axes, num_axes, mode, keep_dims, core_mask);
   }